# ============================================================
# House Prices - Advanced Regression Techniques
# advanced_stacking_split_preprocessing.py
#
# 前処理を分けた「正しいスタッキング」版（RMSLE最適化）
#
# 方針（重要）：
#  - CatBoost はカテゴリを "そのまま" 扱う（One-Hotしない）
#  - 線形/ SVR は One-Hot + 歪度補正(Box-Cox) + RobustScaler
#  - どちらも目的変数は log1p(SalePrice)（RMSLE整合）
#  - StratifiedKFold（y_logをqcutで層化）でOOFを作る
#  - メタモデル（Ridge）がOOF予測から重みを学習（固定重みブレンドより強い）
#
# 注意：
#  - 0.06台は上位勢の領域で保証はできません。
#  - ただ「悪化しやすい原因（CatBoostをOne-Hotで潰す等）」を避け、
#    スタッキングとして筋の良い構成になっています。
#
# 出力：
#  - submission.csv
# 表示：
#  - 各ベースモデルのOOF RMSLE
#  - メタモデルのOOF RMSLE（2段目もCV）
#  - 実行時間（モデル別・合計）
# ============================================================

import time
TOTAL_START = time.time()

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.svm import SVR

from scipy.stats import skew
from scipy.special import boxcox1p

try:
    from catboost import CatBoostRegressor, Pool
except Exception as e:
    raise ImportError(
        "catboost が見つかりません。Kaggle Notebookなら通常入っていますが、"
        "もし無い場合は 'pip install catboost' を実行してください。"
    ) from e


# Random seed shared by the CV splitter and every seedable model below.
SEED = 42
# Number of cross-validation folds; also the divisor used when averaging
# the per-fold test predictions.
N_SPLITS = 5

def rmse(a, b) -> float:
    """Return the root-mean-squared error between ``a`` and ``b``.

    Both targets and predictions are in log1p space throughout this script,
    so the value is reported as RMSLE by the callers.
    """
    mse = mean_squared_error(a, b)
    root = np.sqrt(mse)
    return float(root)


# ============================================================
# 1) Load
# ============================================================
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test  = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Kept aside for the submission file; it is removed from the features below.
test_ID = test["Id"]

# ============================================================
# 2) Outliers (classic)
# ============================================================
# The usual suspects: rows with a very large GrLivArea but a low SalePrice.
outlier_idx = train[(train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)].index
train = train.drop(outlier_idx).reset_index(drop=True)

y = train["SalePrice"].copy()
# "Id" is only a row identifier. Leaving it in would feed it to every base
# model as a numeric feature, so drop it from both feature frames.
X_raw = train.drop(columns=["SalePrice", "Id"], errors="ignore").copy()
X_raw_test = test.drop(columns=["Id"], errors="ignore").copy()

# All models are trained on log1p(SalePrice) so that RMSE on the target
# corresponds to the competition's RMSLE.
y_log = np.log1p(y)

# Stratified bins (log space): quantile-bin the target so each fold gets a
# similar price distribution.
q = 10
y_bins = pd.qcut(y_log, q=q, labels=False, duplicates="drop")
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)


# ============================================================
# 3) CatBoost側の前処理（カテゴリそのまま）
# ============================================================
def add_features_cb(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the engineered columns for the CatBoost side.

    The input frame is not modified. Area and bathroom count columns that are
    present have their missing values replaced by 0. The following columns
    are then appended, each only when its source columns exist (``TotalSF``
    is always added; absent area columns count as 0):

    * ``TotalSF``          - basement + 1st floor + 2nd floor area
    * ``HouseAge``         - ``YrSold - YearBuilt``
    * ``RemodAge``         - ``YrSold - YearRemodAdd``
    * ``TotalBath``        - full baths plus half baths weighted by 0.5
    * ``Qual_x_GrLivArea`` - ``OverallQual * GrLivArea``
    * ``Qual_x_TotalSF``   - ``OverallQual * TotalSF``
    """
    out = df.copy()

    def has(*names: str) -> bool:
        # True when every requested column currently exists in the frame.
        return all(n in out.columns for n in names)

    # For these areas a missing value means the part of the house is absent.
    for col in ("TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GarageArea"):
        if has(col):
            out[col] = out[col].fillna(0)

    basement = out.get("TotalBsmtSF", 0)
    first_floor = out.get("1stFlrSF", 0)
    second_floor = out.get("2ndFlrSF", 0)
    out["TotalSF"] = basement + first_floor + second_floor

    # Ages measured at the time of sale.
    for new_col, year_col in (("HouseAge", "YearBuilt"), ("RemodAge", "YearRemodAdd")):
        if has("YrSold", year_col):
            out[new_col] = out["YrSold"] - out[year_col]

    bath_cols = ("FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath")
    for col in bath_cols:
        if has(col):
            out[col] = out[col].fillna(0)
    if has(*bath_cols):
        out["TotalBath"] = (
            out["FullBath"]
            + 0.5 * out["HalfBath"]
            + out["BsmtFullBath"]
            + 0.5 * out["BsmtHalfBath"]
        )

    # Quality x size interactions.
    if has("OverallQual"):
        quality = out["OverallQual"]
        if has("GrLivArea"):
            out["Qual_x_GrLivArea"] = quality * out["GrLivArea"]
        out["Qual_x_TotalSF"] = quality * out["TotalSF"]

    return out

# Engineered feature frames for the CatBoost branch (categories stay as text).
X_cb = add_features_cb(X_raw)
X_cb_test = add_features_cb(X_raw_test)

# Object-dtype columns of the training frame are the categorical features;
# CatBoost is told about them by positional index.
cat_cols_cb = X_cb.select_dtypes(include=["object"]).columns.tolist()
cat_idx_cb = [X_cb.columns.get_loc(col) for col in cat_cols_cb]

# Missing categories become an explicit "Missing" level in both frames.
for frame in (X_cb, X_cb_test):
    frame[cat_cols_cb] = frame[cat_cols_cb].fillna("Missing")

# Remaining (numeric) columns are imputed with medians taken from the
# training frame only, computed before either frame is filled.
num_cols_cb = X_cb.columns.difference(cat_cols_cb).tolist()
med_cb = X_cb[num_cols_cb].median()
for frame in (X_cb, X_cb_test):
    frame[num_cols_cb] = frame[num_cols_cb].fillna(med_cb)


# ============================================================
# 4) 線形/SVR側の前処理（One-Hot + Box-Cox）
# ============================================================
def preprocess_linear(train_df: pd.DataFrame, test_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build the one-hot / Box-Cox feature matrices for the linear models and SVR.

    Train and test rows are stacked so that imputation values, the skewness
    selection and the one-hot columns are derived from the combined rows and
    both outputs end up with identical columns.

    Steps: fill missing values ("None" / 0 / mode / neighbourhood median /
    column median), add ``TotalSF``, ``HouseAge`` and ``RemodAge``, apply
    ``boxcox1p`` (lambda 0.15) to numeric columns whose absolute skewness
    exceeds 0.75, then one-hot encode.

    Args:
        train_df: Training features (without the target).
        test_df: Test features with the same columns as ``train_df``.

    Returns:
        ``(X_lin, X_lin_test)``: the first ``len(train_df)`` rows and the
        remaining rows of the processed, stacked frame.
    """
    n_train = len(train_df)
    all_data = pd.concat((train_df, test_df), axis=0).reset_index(drop=True)

    # columns where "None" is meaningful (the feature is simply absent)
    none_cols = [
        "PoolQC","MiscFeature","Alley","Fence","FireplaceQu",
        "GarageType","GarageFinish","GarageQual","GarageCond",
        "BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2",
        "MasVnrType"
    ]
    for c in none_cols:
        if c in all_data.columns:
            all_data[c] = all_data[c].fillna("None")

    # numeric zeros for absence
    zero_cols = [
        "GarageYrBlt","GarageArea","GarageCars",
        "BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","TotalBsmtSF",
        "BsmtFullBath","BsmtHalfBath","MasVnrArea"
    ]
    for c in zero_cols:
        if c in all_data.columns:
            all_data[c] = all_data[c].fillna(0)

    # mode fills
    for c in ["MSZoning","Electrical","KitchenQual","Exterior1st","Exterior2nd","SaleType","Functional"]:
        if c in all_data.columns:
            all_data[c] = all_data[c].fillna(all_data[c].mode()[0])

    # LotFrontage by Neighborhood median
    if "LotFrontage" in all_data.columns and "Neighborhood" in all_data.columns:
        all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
            lambda s: s.fillna(s.median())
        )

    # remaining: numeric->median, everything else->mode.
    # is_numeric_dtype is used instead of comparing against "object" so that
    # text columns stored with a non-object dtype are not sent to median().
    for col in all_data.columns:
        if pd.api.types.is_numeric_dtype(all_data[col]):
            all_data[col] = all_data[col].fillna(all_data[col].median())
        else:
            all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

    # light feature eng
    if set(["TotalBsmtSF","1stFlrSF","2ndFlrSF"]).issubset(all_data.columns):
        all_data["TotalSF"] = all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]
    if set(["YrSold","YearBuilt"]).issubset(all_data.columns):
        all_data["HouseAge"] = all_data["YrSold"] - all_data["YearBuilt"]
    if set(["YrSold","YearRemodAdd"]).issubset(all_data.columns):
        all_data["RemodAge"] = all_data["YrSold"] - all_data["YearRemodAdd"]

    # skew correction
    numeric_feats = all_data.select_dtypes(include="number").columns
    skewed = all_data[numeric_feats].apply(lambda x: skew(x)).sort_values(ascending=False)
    skewed_feats = skewed[abs(skewed) > 0.75].index

    lam = 0.15
    for feat in skewed_feats:
        # boxcox1p is undefined (NaN) for values below -1, which can occur in
        # the age features when a sale precedes the build/remodel year.
        # Leave such columns untransformed instead of injecting NaN.
        if all_data[feat].min() < -1:
            continue
        all_data[feat] = boxcox1p(all_data[feat], lam)

    # one-hot (on the stacked frame so train and test share the same columns)
    all_data = pd.get_dummies(all_data, drop_first=False)

    X_lin = all_data.iloc[:n_train].copy()
    X_lin_test = all_data.iloc[n_train:].copy()
    return X_lin, X_lin_test

# Feature matrices for the linear models and SVR (one-hot encoded,
# skew-corrected); row order follows X_raw / X_raw_test.
X_lin, X_lin_test = preprocess_linear(X_raw, X_raw_test)


# ============================================================
# 5) Base models
# ============================================================
# Linear models and the SVR are each wrapped in a pipeline that applies
# RobustScaler first.
linear_estimators = {
    "ElasticNet": ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=SEED),
    "Lasso": Lasso(alpha=0.0005, random_state=SEED),
    "Ridge": Ridge(alpha=10.0, random_state=SEED),
    "SVR": SVR(C=20, epsilon=0.008, gamma=0.0003),
}
base_models = {
    label: make_pipeline(RobustScaler(), estimator)
    for label, estimator in linear_estimators.items()
}

# CatBoost is added last so the dictionary order (and therefore the column
# order of the stacked meta features) is ElasticNet, Lasso, Ridge, SVR, CatBoost.
catboost_params = dict(
    loss_function="RMSE",
    eval_metric="RMSE",
    iterations=20000,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=3.0,
    random_seed=SEED,
    od_type="Iter",
    od_wait=400,
    verbose=0,
)
base_models["CatBoost"] = CatBoostRegressor(**catboost_params)

# Per-model storage: out-of-fold predictions, fold-averaged test
# predictions, and wall-clock training time in seconds.
oof = {}
test_pred = {}
model_time = {}
for label in base_models:
    oof[label] = np.zeros(len(X_raw))
    test_pred[label] = np.zeros(len(X_raw_test))
    model_time[label] = 0.0

# ============================================================
# 6) OOF for each base model (StratifiedKFold)
# ============================================================
# The splitter is seeded, so every call to split() yields the same folds;
# materialize them once and reuse them for every model.
folds = list(skf.split(X_raw, y_bins))

# The CatBoost test pool does not depend on the fold, so build it once
# instead of once per fold.
test_pool = Pool(X_cb_test, cat_features=cat_idx_cb)

for name, m in base_models.items():
    start_m = time.time()

    # CatBoost consumes the raw-category frame; every other model consumes
    # the one-hot / Box-Cox frame.
    is_catboost = name == "CatBoost"
    X_all = X_cb if is_catboost else X_lin
    X_all_test = X_cb_test if is_catboost else X_lin_test

    for tr_idx, va_idx in folds:
        X_tr, X_va = X_all.iloc[tr_idx], X_all.iloc[va_idx]
        y_tr, y_va = y_log.iloc[tr_idx], y_log.iloc[va_idx]

        if is_catboost:
            train_pool = Pool(X_tr, y_tr, cat_features=cat_idx_cb)
            valid_pool = Pool(X_va, y_va, cat_features=cat_idx_cb)

            # NOTE(review): the validation fold is also the early-stopping /
            # best-iteration set, so CatBoost's OOF predictions may look
            # slightly better than truly unseen data would.
            m.fit(train_pool, eval_set=valid_pool, use_best_model=True)

            oof[name][va_idx] = m.predict(valid_pool)
            test_pred[name] += m.predict(test_pool) / N_SPLITS
        else:
            m.fit(X_tr, y_tr)

            oof[name][va_idx] = m.predict(X_va)
            test_pred[name] += m.predict(X_all_test) / N_SPLITS

    # Targets are log1p(SalePrice), so RMSE here is the RMSLE.
    score = rmse(y_log, oof[name])
    model_time[name] = time.time() - start_m
    print(f"{name:10s} | OOF RMSLE: {score:.5f} | time: {model_time[name]/60:.2f} min")


# ============================================================
# 7) Meta model (Ridge) with 2nd-level CV (avoid optimistic bias)
# ============================================================
# One column per base model, in base_models order: OOF predictions for the
# training rows, fold-averaged predictions for the test rows.
stack_order = list(base_models)
meta_X = np.column_stack([oof[label] for label in stack_order])
meta_test_X = np.column_stack([test_pred[label] for label in stack_order])

meta = Ridge(alpha=1.0, random_state=SEED)

meta_oof = np.zeros(len(meta_X))
meta_start = time.time()

# Score the meta model out-of-fold as well, using the same stratified splitter.
for fit_rows, holdout_rows in skf.split(meta_X, y_bins):
    meta.fit(meta_X[fit_rows], y_log.iloc[fit_rows])
    meta_oof[holdout_rows] = meta.predict(meta_X[holdout_rows])

meta_score = rmse(y_log, meta_oof)
meta_time = time.time() - meta_start
print(f"{'Meta(Ridge)':10s} | OOF RMSLE: {meta_score:.5f} | time: {meta_time/60:.2f} min")

# Refit on every training row, then predict the test rows (still log space).
meta_test_pred = meta.fit(meta_X, y_log).predict(meta_test_X)

# ============================================================
# 8) Submission
# ============================================================
# Undo the log1p transform applied to the target.
final_pred = np.expm1(meta_test_pred)
submission = pd.DataFrame({"Id": test_ID})
submission["SalePrice"] = final_pred
submission.to_csv("submission.csv", index=False)
print("✅ saved: submission.csv")

# ============================================================
# 9) Total time
# ============================================================
# Wall-clock time since the script started, followed by per-model times
# (all reported in minutes).
total_time = time.time() - TOTAL_START
print("\n--- Timing summary ---")
for k, elapsed in model_time.items():
    print(f"{k:10s}: {elapsed/60:.2f} min")
print(f"Meta(Ridge): {meta_time/60:.2f} min")
print(f"TOTAL     : {total_time/60:.2f} min")
